home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Language/OS - Multiplatform Resource Library
/
LANGUAGE OS.iso
/
glass
/
glass.lha
/
GLASS
/
tm
/
refex.c
< prev
next >
Wrap
C/C++ Source or Header
|
1990-11-06
|
11KB
|
466 lines
/*
Copyright (C) 1990 C van Reewijk, email: dutentb.uucp!reeuwijk
This file is part of GLASS.
GLASS is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 1, or (at your option)
any later version.
GLASS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GLASS; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
/* File: refex.c
*
* refex - File regular expression pattern matching
* and replacement
*
* By: Ozan S. Yigit (oz)
* Dept. of Computer Science
* York University
*
* Munged by C. van Reeuwijk to do shell style pattern matching.
* {} shell patterns are not handled.
*
* These routines are the PUBLIC DOMAIN equivalents
* of refex routines as found in 4.nBSD UN*X, with minor
* extensions.
*
* These routines are derived from various implementations
* found in software tools books, and Conroy's grep. They
* are NOT derived from licensed/restricted software.
* For more interesting/academic/complicated implementations,
* see Henry Spencer's refexp routines, or GNU Emacs pattern
* matching module.
*
* Routines:
* ref_comp: compile a regular expression into a DFA.
*
* char *ref_comp(s)
* char *s;
*
* ref_exec: execute the DFA to match a pattern.
*
* int ref_exec(s)
* char *s;
*
* ref_subs: substitute the matched portions in
* a new string.
*
* int ref_subs(src, dst)
* char *src;
* char *dst;
*
* Regular Expressions:
*
* [1] char matches itself, unless it is a special
* character (metachar): . \ [ ] * + ^ $
*
* [2] ? matches any character.
*
* [3] \ matches the character following it, except
* when followed by a digit 1 to 9.
* (see [7], [8] and [9])
* It is used as an escape character for all
* other meta-characters, and itself. When used
* in a set ([4]), it is treated as an ordinary
* character.
*
* [4] [set] matches one of the characters in the set.
* If the first character in the set is "^",
* it matches a character NOT in the set. A
* shorthand S-E is used to specify a set of
* characters S upto E, inclusive. The special
* characters "]" and "-" have no special
* meaning if they appear as the first chars
* in the set.
* examples: match:
*
* [a-z] any lowercase alpha
*
* [^]-] any char except ] and -
*
* [^A-Z] any char except uppercase
* alpha
*
* [a-zA-Z] any alpha
*
* [5] * matches the longest possible span of zero or more
* arbitrary characters.
*
* [6] a regular expression in the form [1] to [10], enclosed
* as (form) matches what form matches. The enclosure
* creates a set of tags, used for pattern substution.
* The tagged forms are numbered starting from 1.
*
* [7] a composite regular expression xy where x and y
* are in the form [1] to [7] matches x followed by
* a match for y.
*
* Acknowledgements:
*
* HCR's Hugh Redelmeier has been most helpful in various
* stages of development. He convinced me to include BOW
* and EOW constructs, originally invented by Rob Pike at
* the University of Toronto.
*
* References:
* Software tools Kernighan & Plauger
* Software tools in Pascal Kernighan & Plauger
* Grep [rsx-11 C dist] David Conroy
* ed - text editor Un*x Programmer's Manual
* Advanced editing on Un*x B. W. Kernighan
* RegExp routines Henry Spencer
*
* Notes:
*
* This implementation uses a bit-set representation for character
* classes for speed and compactness. Each character is represented
* by one bit in a 128-bit block. Thus, CCL or NCL always takes a
* constant 16 bytes in the internal dfa, and ref_exec does a single
* bit comparison to locate the character in the set.
*
* Examples:
*
* pattern: fo*
* compile: CHR f CHR o star END
* matches: fo foo fooo foobar fobar foxx ...
*
* pattern: fo[ob]a[rz]
* compile: CHR f CHR o CCL bitset CHR a CCL bitset END
* matches: fobar fooar fobaz fooaz
*
* pattern: (foo)[1-3]
* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset END
* matches: foo1 foo2 foo3
*
* pattern: (fo*)-
* compile: BOT 1 CHR f CHR o STAR EOT 1 CHR - END
* matches: foo- fo- fob- foobar-
*
*/
#include <stdio.h>
#include "refex.h"
#define MAXDFA 1024
#define MAXTAG 10
#define OKP 1
#define NOP 0
#define CHR 1
#define ANY 2
#define CCL 3
#define NCL 4
#define BOT 7
#define EOT 8
#define STAR 13
#define END 0
/*
* The following defines are not meant
* to be changeable. They are for readibility
* only.
*
*/
#define MAXCHR 128
#define CHRBIT 8
#define BITBLK MAXCHR/CHRBIT
#define BLKIND 0170
#define BITIND 07
#define ASCIIB 0177
typedef /*unsigned*/ char CHAR;
static int tagstk[MAXTAG]; /* subpat tag stack..*/
static CHAR dfa[MAXDFA]; /* automaton.. */
static int sta = NOP; /* status of lastpat */
static CHAR bittab[BITBLK]; /* bit table for CCL */
static void chset( c )
register CHAR c;
{
bittab[((c)&BLKIND)>>3] |= 1<<((c)&BITIND);
}
#define badpat(x) return(*dfa = END, x)
#define store(x) *mp++ = x
char *ref_comp(pat)
char *pat;
{
register char *p; /* pattern pointer */
register CHAR *mp=dfa; /* dfa pointer */
register int tagi = 0; /* tag stack index */
register int tagc = 1; /* actual tag count */
register int n;
int c1, c2;
if (!pat || !*pat)
if (sta)
return( 0 );
else
badpat("No previous regular expression");
sta = NOP;
for (p = pat; *p != '\0'; p++) {
switch(*p){
case '?': /* match any char.. */
store(ANY);
break;
case '[': /* match char class..*/
if (*++p == '^') {
store(NCL);
p++;
}
else
store(CCL);
if (*p == '-') /* real dash */
chset(*p++);
if (*p == ']') /* real brac */
chset(*p++);
while (*p && *p != ']') {
if (*p == '-' && *(p+1) && *(p+1) != ']') {
p++;
c1 = *(p-2) + 1;
c2 = *p++;
while (c1 <= c2)
chset(c1++);
}
else
chset(*p++);
}
if (!*p)
badpat("Missing ]");
for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
store(bittab[n]);
break;
case '*': /* match 0 or more.. */
store(STAR);
break;
case '(':
if( tagc>=MAXTAG ){
badpat( "Too many () pairs" );
}
tagstk[++tagi] = tagc;
store(BOT);
store(tagc++);
break;
case ')':
if( tagi<=0 ){
badpat( "Unmatched )" );
}
store( EOT );
store( tagstk[tagi--] );
break;
case '\\': /* tags, backrefs .. */
p++;
store(CHR);
store(*p);
break;
default : /* an ordinary char */
store(CHR);
store(*p);
break;
}
}
if (tagi > 0)
badpat("Unmatched \\(");
store(END);
sta = OKP;
return( 0 );
}
static char *bopat[MAXTAG];
static char *eopat[MAXTAG];
static char *pmatch();
/*
* ref_exec:
* execute dfa to match the string.
*
* If the pattern matches, bopat[0] and eopat[0] are set
* to the beginning and the end of the matched fragment,
* respectively.
*
* return 1 if the compiled pattern matches the given string,
* or else 0.
*/
int ref_exec(lp)
register char *lp;
{
register char *ep = 0;
register CHAR *ap = dfa;
bopat[0] = 0;
bopat[1] = 0;
bopat[2] = 0;
bopat[3] = 0;
bopat[4] = 0;
bopat[5] = 0;
bopat[6] = 0;
bopat[7] = 0;
bopat[8] = 0;
bopat[9] = 0;
if( *ap == END ) return( 0 ); /* munged automaton. fail always */
ep = pmatch( lp, ap );
if (!ep)
return( 0 );
bopat[0] = lp;
eopat[0] = ep;
return(1);
}
/*
* pmatch:
* internal routine for the hard part
*
* This code is mostly snarfed from an early
* grep written by David Conroy. The backref and
* tag stuff, and various other mods are by oZ.
*
* At the end of a successful match, bopat[n] and eopat[n]
* are set to the beginning and end of subpatterns matched
* by tagged expressions (n = 1 to 9).
*
*/
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & (1<<((y)&BITIND)))
static char *pmatch( lp, ap )
register char *lp;
register CHAR *ap;
{
register char *e; /* extra pointer for CLO */
register int op;
register int c;
char *are; /* to save the line ptr. */
for(;;){
op = *ap++;
if( op == END ) break;
switch(op) {
case CHR:
if (*lp++ != *ap++)
return( 0 );
break;
case ANY:
if (*lp++ == '\0' )
return( 0 );
break;
case CCL:
c = *lp++;
if (!isinset(ap,c))
return( 0 );
ap += BITBLK;
break;
case NCL:
c = *lp++;
if (isinset(ap,c))
return( 0 );
ap += BITBLK;
break;
case BOT:
bopat[*ap++] = lp;
break;
case EOT:
eopat[*ap++] = lp;
break;
case STAR:
are = lp;
while (*lp) lp++;
while( lp >= are ){
if( e = pmatch( lp, ap) ) return( e );
--lp;
}
return( 0 );
default:
fprintf( stderr, "ref_exec: bad dfa opcode: %d.\n", op );
exit( 2 );
}
}
if( *lp != '\0' ) return( 0 );
return( lp );
}
/* Substitute the matched portions of the 'src' in 'dst'.
* The following special characters are recognized in 'src':
*
* &:
* Substitute the entire matched pattern.
*
* \digit:
* Substitute a subpattern, with the given tag number. Tags are
* numbered from 1 to 9. If the particular tagged subpattern
* does not exist, the empty string is substituted.
*/
void ref_subs( src, dst )
register char *src;
register char *dst;
{
register char c;
register int pin;
register char *bp;
register char *ep;
if( !*src || !bopat[0] )
return;
while( c = *src++ ){
switch(c) {
case '&':
pin = 0;
break;
case '\\':
c = *src++;
if (c >= '0' && c <= '9') {
pin = c - '0';
break;
}
default:
*dst++ = c;
continue;
}
bp = bopat[pin];
ep = eopat[pin];
if( bp && ep ){
while( *bp != '\0' && bp < ep )
*dst++ = *bp++;
}
}
*dst = '\0';
}